rm(list = ls(all = TRUE))
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  555039 29.7    1242597 66.4   686457 36.7
## Vcells 1017255  7.8    8388608 64.0  1876641 14.4
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)


`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

1 Ath gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')

2 Ath SKM & annotation

note: some duplicated ids in PSS

fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12', 
               'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]


fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
  # Remove NA and empty strings
  x = x[!is.na(x) & x != ""]
  paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]


fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]

pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
  pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]

pss_long = melt(
  pss,
  id.vars = c("name", "all_pathways", 'short_name'),       # Columns to keep as is
  measure.vars = patterns("^id_"),           # Columns to melt (all starting with "id_")
  variable.name = "id_num",                   # Name for the melted variable column
  value.name = "id"                           # Name for the melted value column
)

pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))
## 
## FALSE  TRUE 
##   816    24
pss_long %>%
  dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
  dplyr::arrange(id) %>%
  print()
##                                              all_pathways short_name        id
##                                                    <char>     <char>    <char>
##  1:                               Hormone - Ethylene (ET)      ORA59 AT1G06160
##  2:                               Hormone - Ethylene (ET)  ERF/ORA59 AT1G06160
##  3:                         Hormone - Salicylic acid (SA)       TCP8 AT1G58100
##  4:                         Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
##  5:                               Hormone - Ethylene (ET)       EDF2 AT1G68840
##  6:                               Hormone - Ethylene (ET)    ERF/EDF AT1G68840
##  7: Signalling - Heat-shock proteins (HSPs),Stress - Heat      HSP70 AT3G12580
##  8:               Signalling - Heat-shock proteins (HSPs)        HSP AT3G12580
##  9:                               Hormone - Ethylene (ET)       ERF1 AT3G23240
## 10:                               Hormone - Ethylene (ET)    ERF/EDF AT3G23240
## 11:                               Hormone - Ethylene (ET)       ERF6 AT4G17490
## 12:                               Hormone - Ethylene (ET)  ERF/ORA59 AT4G17490
## 13:                               Hormone - Ethylene (ET)       ERF1 AT4G17500
## 14:                               Hormone - Ethylene (ET)    ERF/EDF AT4G17500
## 15:               Signalling - Heat-shock proteins (HSPs)     MED37E AT5G02500
## 16:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G02500
## 17:                               Hormone - Ethylene (ET)     ERF096 AT5G43410
## 18:                               Hormone - Ethylene (ET)    ERF/EDF AT5G43410
## 19:                               Hormone - Ethylene (ET)       ERF5 AT5G47230
## 20:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G47230
## 21:                               Hormone - Ethylene (ET)     ERF105 AT5G51190
## 22:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G51190
## 23:               Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G59720
## 25:                               Hormone - Ethylene (ET)     ERF104 AT5G61600
## 26:                               Hormone - Ethylene (ET)    ERF/EDF AT5G61600
##                                              all_pathways short_name        id
pss_long = pss_long %>%
  dplyr::filter(stringr::str_starts(id, "AT")) %>%
  dplyr::group_by(id) %>%
  dplyr::summarise(
    dplyr::across(
      .cols = dplyr::everything(),
      .fns = ~ {
        vals = unique(na.omit(.))
        if (length(vals) > 1) paste(vals, collapse = " | ")
        else if (length(vals) == 1) vals
        else NA_character_
      }
    ),
    .groups = "drop"
  )

pss_long[pss_long == ""] = "-"
gn[is.na(gn)] = "-"
sn[is.na(sn)] = "-"

ath.annot = pss_long %>%
  dplyr::full_join(gn, by = c("id" = "V1")) %>%
  dplyr::full_join(sn, by = c("id" = "V1"))

Note: 35.2 bin matcheswill be ignored

3 Abbreviations

Plant Name Label JCVI-MCScan Compara Plants Plaza OrthoDB FastOma RBH Mercator
Malus domestica apple mdo_GDv1 malus_domestica_golden mdo mdo mdo mdo mdo
mdo_HChap1
Prunus persica ppe ppe prunus_persica ppe ppe pper ppe pper
Prunus dulcis / P. amygdalus almond almond prunus_dulcis pdul pdul pdul pdul
Prunus avium wild cherry wildcherry prunus_avium pavi pavi pavi pavi
Prunus armeniaca apricot apricot parm parm parm parm
Prunus cerasifera cherry plum cherryplum pcer pcer pcer
Pyrus pear pear pcox pcox pcox
Prunus sibirica Siberian apricot siberianapricot psib psib psib psib
group = "fruitTrees"
params_list <- list(
  plantName = 'mdo'
  , plantNameOut = "apple"
  , plantDirOut = file.path('..', 'reports', group, "apple")
  , flag = 1
)

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000002603af3d5e8>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-10] | |…… | 11% | |…….. | 15% [unnamed-chunk-11] | |……… | 19% | |……….. | 22% [unnamed-chunk-12] | |…………. | 26% | |…………… | 30% [unnamed-chunk-13] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-14] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-15] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-16] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-17] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-18] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-19] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-20] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-21] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-22] | |……………………………………………| 100%

cat(child_content)

4 Subsection: mdo

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

4.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  20997 77128 32036 56069 30068 37682
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID                   to_plant source 
##    <chr>       <chr>      <chr>                       <chr>    <chr>  
##  1 AT4G10330   ath        Maldo.hc.v1a1.ch10A.g00003  mdo      FastOMA
##  2 AT4G10340   ath        Maldo.hc.v1a1.ch10A.g00004  mdo      FastOMA
##  3 AT5G44410   ath        g1                          mdo      FastOMA
##  4 AT5G44440   ath        g1                          mdo      FastOMA
##  5 AT1G03770   ath        Maldo.hc.v1a1.ch10A.g02825  mdo      MCScanX
##  6 AT1G03800   ath        Maldo.hc.v1a1.ch10A.g02815  mdo      MCScanX
##  7 ATMG00910   ath        Maldo.hc.v1a1.sc45A.g49878  mdo      MCScanX
##  8 ATMG01020   ath        Maldo.hc.v1a1.sc45A.g49883  mdo      MCScanX
##  9 AT2G46320   ath        Maldo.hc.v1a1.ch1A.g26266   mdo      OrthoDB
## 10 AT4G27940   ath        Maldo.hc.v1a1.ch1A.g26266   mdo      OrthoDB
## 11 AT2G07695   ath        Maldo.hc.v1a1.sc164A.g48922 mdo      OrthoDB
## 12 AT2G07695   ath        Maldo.hc.v1a1.sc119A.g48697 mdo      OrthoDB
## 13 AT2G28190   ath        Maldo.hc.v1a1.ch6A.g38979   mdo      PLAZA  
## 14 AT2G07732   ath        Maldo.hc.v1a1.sc90A.g49976  mdo      PLAZA  
## 15 AT5G17400   ath        Maldo.hc.v1a1.ch9A.g48118   mdo      PLAZA  
## 16 AT2G47300   ath        Maldo.hc.v1a1.ch17A.g24110  mdo      PLAZA  
## 17 AT1G01020   ath        Maldo.hc.v1a1.ch7A.g41990   mdo      RBH    
## 18 AT1G01030   ath        Maldo.hc.v1a1.ch1A.g25187   mdo      RBH    
## 19 ATMG01410   ath        Maldo.hc.v1a1.sc36A.g49760  mdo      RBH    
## 20 ATMG01410   ath        Maldo.hc.v1a1.sc71A.g49908  mdo      RBH    
## 21 AT4G39400   ath        Maldo.hc.v1a1.ch2A.g27501   mdo      compara
## 22 AT4G39400   ath        Maldo.hc.v1a1.ch15A.g17036  mdo      compara
## 23 AT1G65880   ath        Maldo.hc.v1a1.ch17A.g24066  mdo      compara
## 24 AT5G52820   ath        Maldo.hc.v1a1.ch17A.g24067  mdo      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  1962915 104.9    3293471 175.9  3125948 167.0
## Vcells 19558200 149.3   31783875 242.5 31746152 242.3

4.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23046
length(unique(dt$to_geneID))
## [1] 34410
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   20997   77128   32036   56069   30068   37682
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

4.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

4.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

4.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

4.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "all_pathways"    "short_name"     
## [17] "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
##  FALSE   TRUE 
## 117481     27
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
##  [1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
## [16] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   7350008 392.6   13692148  731.3  13504928  721.3
## Vcells 125473339 957.3  195447237 1491.2 195431551 1491.1

4.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 23180

4.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 23046 
##      # mdo unigue genes: 34410 
##      # ath highest connection degree:  128 
##      # mdo highest connection degree:  116 
##      # genes in ath with degree > 30:  319 
##      # genes in mdo with degree > 30:  288
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          72085          23180          13328           8915
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       36847      10783    10916           7960
##   2        9865       3217     1663            576
##   3        6786       2076      473            178
##   4        6739       2218      150            104
##   5        7352       2776       96             69
##   6        4496       2110       30             28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
## 117508
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

4.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          30593          11555           2976           1019
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        2523       1870     1615            345
##   2        5377       1399      709            352
##   3        5006       1479      383            135
##   4        6010       2003      144             95
##   5        7181       2694       95             64
##   6        4496       2110       30             28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 19719 
##      # mdo unigue genes: 29104 
##      # ath highest connection degree:  44 
##      # mdo highest connection degree:  45 
##      # genes in ath with degree > 30:  7 
##      # genes in mdo with degree > 30:  4
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##                filter_criteria Count
##                         <fctr> <int>
## 1:                     MCScanX 32036
## 2:                     compara  5278
## 3:                       PLAZA  6568
## 4: OrthoDB_FastOMA_RBH_MapMan4   444
## 5:     FastOMA_OrthoDB_MapMan4   822
## 6:         OrthoDB_RBH_MapMan4   485
## 7:         FastOMA_RBH_MapMan4   510
## 8:                      reject 71365

4.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##                     compara     FastOMA_OrthoDB_MapMan4 
##                         163                          61 
##         FastOMA_RBH_MapMan4                     MCScanX 
##                          27                        1336 
## OrthoDB_FastOMA_RBH_MapMan4         OrthoDB_RBH_MapMan4 
##                          23                          25 
##                       PLAZA                      reject 
##                         264                        2046
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'ppe'
  , plantNameOut = "peach"
  , plantDirOut = file.path('..', 'reports', group, "peach")
  , flag = 1
)

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000260c8320e08>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-38] | |…… | 11% | |…….. | 15% [unnamed-chunk-39] | |……… | 19% | |……….. | 22% [unnamed-chunk-40] | |…………. | 26% | |…………… | 30% [unnamed-chunk-41] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-42] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-43] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-44] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-45] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-46] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-47] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-48] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-49] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-50] | |……………………………………………| 100%

cat(child_content)

5 Subsection: ppe

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

5.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  16193 44006 17894 38370 20791 24564
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT1G12040   ath        Prupe.1G000500 ppe      FastOMA
##  2 AT1G62440   ath        Prupe.1G000500 ppe      FastOMA
##  3 AT1G61010   ath        Prupe.I006100  ppe      FastOMA
##  4 AT1G61010   ath        Prupe.I006200  ppe      FastOMA
##  5 AT1G04230   ath        Prupe.1G027200 ppe      MCScanX
##  6 AT1G04240   ath        Prupe.1G027500 ppe      MCScanX
##  7 ATCG00530   ath        Prupe.7G029500 ppe      MCScanX
##  8 ATCG00680   ath        Prupe.7G029800 ppe      MCScanX
##  9 AT3G17900   ath        Prupe.1G267800 ppe      OrthoDB
## 10 AT4G35230   ath        Prupe.1G355500 ppe      OrthoDB
## 11 AT2G07675   ath        Prupe.6G146800 ppe      OrthoDB
## 12 ATMG00980   ath        Prupe.6G146800 ppe      OrthoDB
## 13 AT3G02890   ath        Prupe.2G329000 ppe      PLAZA  
## 14 AT5G16680   ath        Prupe.2G329000 ppe      PLAZA  
## 15 AT4G03170   ath        Prupe.4G260600 ppe      PLAZA  
## 16 AT4G03160   ath        Prupe.4G260600 ppe      PLAZA  
## 17 AT1G01030   ath        Prupe.5G134900 ppe      RBH    
## 18 AT1G01040   ath        Prupe.2G200900 ppe      RBH    
## 19 ATMG01250   ath        Prupe.6G123900 ppe      RBH    
## 20 ATMG01250   ath        Prupe.7G164000 ppe      RBH    
## 21 AT5G01010   ath        Prupe.6G300700 ppe      compara
## 22 AT5G01020   ath        Prupe.6G300300 ppe      compara
## 23 AT1G80940   ath        Prupe.3G103600 ppe      compara
## 24 AT1G80950   ath        Prupe.1G382400 ppe      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4886860 261.0   10953719  585.0  13692148  731.3
## Vcells 116113180 885.9  195447237 1491.2 195446431 1491.2

5.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23113
length(unique(dt$to_geneID))
## [1] 21245
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   16193   44006   17894   38370   20791   24564
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

5.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

5.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

5.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

5.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "all_pathways"    "short_name"     
## [17] "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE  TRUE 
## 71222   231
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
##   [1] "Prupe.I000100" "Prupe.I000200" "Prupe.I000200" "Prupe.I000200"
##   [5] "Prupe.I000200" "Prupe.I000200" "Prupe.I000200" "Prupe.I000300"
##   [9] "Prupe.I000300" "Prupe.I000300" "Prupe.I000400" "Prupe.I000400"
##  [13] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [17] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [21] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [25] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [29] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [33] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [37] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [41] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [45] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [49] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000600"
##  [53] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [57] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [61] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [65] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [69] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [73] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [77] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [81] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [85] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [89] "Prupe.I000600" "Prupe.I000600" "Prupe.I000700" "Prupe.I000700"
##  [93] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
##  [97] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [101] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [105] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [109] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [113] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [117] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [121] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [125] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [129] "Prupe.I000700" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [133] "Prupe.I000800" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [137] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I000900"
## [141] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I001000"
## [145] "Prupe.I001000" "Prupe.I001000" "Prupe.I001000" "Prupe.I001000"
## [149] "Prupe.I001000" "Prupe.I001000" "Prupe.I001100" "Prupe.I001100"
## [153] "Prupe.I001100" "Prupe.I001600" "Prupe.I001700" "Prupe.I001700"
## [157] "Prupe.I001700" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [161] "Prupe.I001800" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [165] "Prupe.I001800" "Prupe.I001800" "Prupe.I001900" "Prupe.I001900"
## [169] "Prupe.I002100" "Prupe.I002100" "Prupe.I002300" "Prupe.I002300"
## [173] "Prupe.I002300" "Prupe.I002400" "Prupe.I002400" "Prupe.I002600"
## [177] "Prupe.I002600" "Prupe.I002800" "Prupe.I002800" "Prupe.I002900"
## [181] "Prupe.I002900" "Prupe.I003000" "Prupe.I003000" "Prupe.I003100"
## [185] "Prupe.I003100" "Prupe.I003100" "Prupe.I003200" "Prupe.I003200"
## [189] "Prupe.I003200" "Prupe.I003300" "Prupe.I003400" "Prupe.I003400"
## [193] "Prupe.I003400" "Prupe.I003400" "Prupe.I003400" "Prupe.I003400"
## [197] "Prupe.I003400" "Prupe.I003800" "Prupe.I003800" "Prupe.I003900"
## [201] "Prupe.I003900" "Prupe.I004000" "Prupe.I004000" "Prupe.I004400"
## [205] "Prupe.I004400" "Prupe.I004400" "Prupe.I004400" "Prupe.I004400"
## [209] "Prupe.I004400" "Prupe.I004500" "Prupe.I004500" "Prupe.I004500"
## [213] "Prupe.I004500" "Prupe.I004500" "Prupe.I004500" "Prupe.I005000"
## [217] "Prupe.I005000" "Prupe.I005100" "Prupe.I005200" "Prupe.I005200"
## [221] "Prupe.I005200" "Prupe.I005200" "Prupe.I005200" "Prupe.I005500"
## [225] "Prupe.I005500" "Prupe.I005500" "Prupe.I005600" "Prupe.I005700"
## [229] "Prupe.I005800" "Prupe.I006100" "Prupe.I006200"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  5562657 297.1   10953719  585.0  13692148  731.3
## Vcells 90359306 689.4  195447237 1491.2 195447236 1491.2

5.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 16428

5.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 23113 
##      # ppe unigue genes: 21245 
##      # ath highest connection degree:  57 
##      # ppe highest connection degree:  115 
##      # genes in ath with degree > 30:  264 
##      # genes in ppe with degree > 30:  135
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          43931          16428           6360           4734
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       21126       7929     5410           4119
##   2        6460       2272      560            368
##   3        4106       1255      185            103
##   4        4182       1339      103             68
##   5        4892       1998       68             51
##   6        3165       1635       34             25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
##  71453
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

5.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          19587           7852           1432            657
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        1873       1402      853            215
##   2        3467        929      260            222
##   3        2797        808      125             81
##   4        3607       1168       92             63
##   5        4678       1910       68             51
##   6        3165       1635       34             25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 19239 
##      # ppe unigue genes: 18731 
##      # ath highest connection degree:  40 
##      # ppe highest connection degree:  19 
##      # genes in ath with degree > 30:  2 
##      # genes in ppe with degree > 30:  0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##                filter_criteria Count
##                         <fctr> <int>
## 1:                     MCScanX 17894
## 2:                     compara  5084
## 3:                       PLAZA  4926
## 4: OrthoDB_FastOMA_RBH_MapMan4   317
## 5:     FastOMA_OrthoDB_MapMan4   731
## 6:         OrthoDB_RBH_MapMan4   359
## 7:         FastOMA_RBH_MapMan4   217
## 8:                      reject 41925

5.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##                     compara     FastOMA_OrthoDB_MapMan4 
##                         145                          34 
##         FastOMA_RBH_MapMan4                     MCScanX 
##                          11                         728 
## OrthoDB_FastOMA_RBH_MapMan4         OrthoDB_RBH_MapMan4 
##                          25                          16 
##                       PLAZA                      reject 
##                         166                        1269
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pdul'
  , plantNameOut = "almond"
  , plantDirOut = file.path('..', 'reports', group, "almond")
  , flag = 2
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000260c64465b0>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-66] | |…… | 11% | |…….. | 15% [unnamed-chunk-67] | |……… | 19% | |……….. | 22% [unnamed-chunk-68] | |…………. | 26% | |…………… | 30% [unnamed-chunk-69] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-70] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-71] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-72] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-73] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-74] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-75] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-76] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-77] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-78] | |……………………………………………| 100%

cat(child_content)

6 Subsection: pdul

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

6.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  15975 42927 20148 35734 24829
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT2G05620   ath        TexasF1_G1000  pdul     FastOMA
##  2 AT3G48880   ath        TexasF1_G10001 pdul     FastOMA
##  3 AT3G48890   ath        TexasF1_G9999  pdul     FastOMA
##  4 AT5G52240   ath        TexasF1_G9999  pdul     FastOMA
##  5 AT1G04220   ath        TexasF1_G767   pdul     MCScanX
##  6 AT1G04230   ath        TexasF1_G779   pdul     MCScanX
##  7 ATCG00170   ath        TexasF1_G22620 pdul     MCScanX
##  8 ATCG00500   ath        TexasF1_G22624 pdul     MCScanX
##  9 AT1G23390   ath        TexasF1_G3359  pdul     OrthoDB
## 10 AT5G19210   ath        TexasF1_G2060  pdul     OrthoDB
## 11 AT3G51810   ath        TexasF1_G23162 pdul     OrthoDB
## 12 AT2G28815   ath        TexasF1_G6420  pdul     OrthoDB
## 13 AT1G01030   ath        TexasF1_G18833 pdul     RBH    
## 14 AT1G01040   ath        TexasF1_G9057  pdul     RBH    
## 15 ATMG00860   ath        TexasF1_G25095 pdul     RBH    
## 16 ATMG01250   ath        TexasF1_G22012 pdul     RBH    
## 17 AT4G37540   ath        TexasF1_G22582 pdul     compara
## 18 AT5G67420   ath        TexasF1_G22582 pdul     compara
## 19 AT4G15960   ath        TexasF1_G27715 pdul     compara
## 20 AT3G45140   ath        TexasF1_G6796  pdul     compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger (Mb)  max used   (Mb)
## Ncells  4014451 214.4    8762976  468  13692148  731.3
## Vcells 83929985 640.4  156357790 1193 195447236 1491.2

6.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22451
length(unique(dt$to_geneID))
## [1] 20903
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB     RBH 
##   15975   42927   20148   35734   24829
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

6.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

6.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

6.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

6.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "compara"         "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "all_pathways"    "short_name"      "athName"        
## [17] "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE 
## 67030
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger (Mb)  max used   (Mb)
## Ncells  3898023 208.2    8762976  468  13692148  731.3
## Vcells 57539806 439.0  156357790 1193 195447236 1491.2

6.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 14567

6.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22451 
##      # pdul unigue genes: 20903 
##      # ath highest connection degree:  150 
##      # pdul highest connection degree:  113 
##      # genes in ath with degree > 30:  171 
##      # genes in pdul with degree > 30:  120
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          39774          14567           7625           5064
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       19498       7186     6154           4134
##   2        6174       1991      848            468
##   3        4138       1446      289            176
##   4        4954       1821      187            146
##   5        5010       2123      147            140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
##  67030
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

6.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          17566           6240           1707            974
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        1171        694      690            253
##   2        3490        699      461            301
##   3        3215        994      230            144
##   4        4680       1730      179            136
##   5        5010       2123      147            140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 18482 
##      # pdul unigue genes: 17119 
##      # ath highest connection degree:  22 
##      # pdul highest connection degree:  18 
##      # genes in ath with degree > 30:  0 
##      # genes in pdul with degree > 30:  0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##                filter_criteria Count
##                         <fctr> <int>
## 1:                     MCScanX 20148
## 2:                     compara  4234
## 3: OrthoDB_FastOMA_RBH_MapMan4   510
## 4:     FastOMA_OrthoDB_MapMan4   697
## 5:         OrthoDB_RBH_MapMan4   474
## 6:         FastOMA_RBH_MapMan4   424
## 7:                      reject 40543

6.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##                     compara     FastOMA_OrthoDB_MapMan4 
##                         118                          47 
##         FastOMA_RBH_MapMan4                     MCScanX 
##                          18                         843 
## OrthoDB_FastOMA_RBH_MapMan4         OrthoDB_RBH_MapMan4 
##                          42                          17 
##                      reject 
##                        1320
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pavi'
  , plantNameOut = "wildcherry"
  , plantDirOut = file.path('..', 'reports', group, "wildcherry")
  , flag = 2
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000260c45bc740>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-94] | |…… | 11% | |……. | 15% [unnamed-chunk-95] | |……… | 19% | |……….. | 22% [unnamed-chunk-96] | |…………. | 26% | |…………… | 30% [unnamed-chunk-97] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-98] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-99] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-100] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-101] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-102] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-103] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-104] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-105] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-106] | |…………………………………………..| 100%

cat(child_content)

7 Subsection: pavi

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

7.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  4367 45924 19708 38228 25594
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_plant to_geneID  to_plant source 
##    <chr>       <chr>      <chr>      <chr>    <chr>  
##  1 AT1G12040   ath        FUN_000050 pavi     FastOMA
##  2 AT1G62440   ath        FUN_000050 pavi     FastOMA
##  3 AT2G43840   ath        FUN_040335 pavi     FastOMA
##  4 AT2G44050   ath        FUN_040336 pavi     FastOMA
##  5 AT1G04220   ath        FUN_000300 pavi     MCScanX
##  6 AT1G04230   ath        FUN_000316 pavi     MCScanX
##  7 ATMG01190   ath        FUN_026738 pavi     MCScanX
##  8 ATMG00910   ath        FUN_040205 pavi     MCScanX
##  9 AT4G39370   ath        FUN_020728 pavi     OrthoDB
## 10 AT3G06350   ath        FUN_020749 pavi     OrthoDB
## 11 AT4G24220   ath        FUN_029917 pavi     OrthoDB
## 12 AT4G24220   ath        FUN_029968 pavi     OrthoDB
## 13 AT1G01030   ath        FUN_025493 pavi     RBH    
## 14 AT1G01040   ath        FUN_011748 pavi     RBH    
## 15 ATMG01250   ath        FUN_040221 pavi     RBH    
## 16 ATMG01360   ath        FUN_026804 pavi     RBH    
## 17 AT2G22690   ath        FUN_021390 pavi     compara
## 18 AT4G37880   ath        FUN_021390 pavi     compara
## 19 AT4G39770   ath        FUN_021012 pavi     compara
## 20 AT4G21740   ath        FUN_032538 pavi     compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger (Mb)  max used   (Mb)
## Ncells  3137040 167.6    8762977  468  13692148  731.3
## Vcells 53926859 411.5  156357790 1193 195447236 1491.2

7.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22167
length(unique(dt$to_geneID))
## [1] 21950
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB     RBH 
##    4367   45924   19708   38228   25594
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

7.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

7.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

7.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

7.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "compara"         "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "all_pathways"    "short_name"      "athName"        
## [17] "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE  TRUE 
## 71643     2
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## [1] "FUN_040149" "FUN_040149"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger (Mb)  max used   (Mb)
## Ncells  3957200 211.4    8762977  468  13692148  731.3
## Vcells 58643052 447.5  156357790 1193 195447236 1491.2

7.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 15130

7.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22167 
##      # pavi unigue genes: 21950 
##      # ath highest connection degree:  122 
##      # pavi highest connection degree:  115 
##      # genes in ath with degree > 30:  242 
##      # genes in pavi with degree > 30:  131
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          41733          15130           8890           5892
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       20986       7805     7489           4881
##   2        7358       2253      946            572
##   3        6191       2096      298            243
##   4        6023       2400      131            163
##   5        1175        576       26             33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
##  71645
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

7.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          18503           5202           1518           1055
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        1074        513      732            249
##   2        4956        670      415            395
##   3        5345       1075      216            215
##   4        5953       2368      129            163
##   5        1175        576       26             33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 17405 
##      # pavi unigue genes: 16539 
##      # ath highest connection degree:  31 
##      # pavi highest connection degree:  16 
##      # genes in ath with degree > 30:  2 
##      # genes in pavi with degree > 30:  0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##                filter_criteria Count
##                         <fctr> <int>
## 1:                     MCScanX 19708
## 2:                     compara  1309
## 3: OrthoDB_FastOMA_RBH_MapMan4  2158
## 4:     FastOMA_OrthoDB_MapMan4  1457
## 5:         OrthoDB_RBH_MapMan4   923
## 6:         FastOMA_RBH_MapMan4   723
## 7:                      reject 45367

7.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##                     compara     FastOMA_OrthoDB_MapMan4 
##                          47                          65 
##         FastOMA_RBH_MapMan4                     MCScanX 
##                          34                         789 
## OrthoDB_FastOMA_RBH_MapMan4         OrthoDB_RBH_MapMan4 
##                          77                          54 
##                      reject 
##                        1683
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'parm'
  , plantNameOut = "apricot"
  , plantDirOut = file.path('..', 'reports', group, "apricot")
  , flag = 3
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000260b994a690>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-122] | |…… | 11% | |……. | 15% [unnamed-chunk-123] | |……… | 19% | |……….. | 22% [unnamed-chunk-124] | |…………. | 26% | |…………… | 30% [unnamed-chunk-125] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-126] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-127] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-128] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-129] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-130] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-131] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-132] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-133] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-134] | |…………………………………………..| 100%

cat(child_content)

8 Subsection: parm

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

8.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  43038 18615 52084 25259
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 16 × 5
##    from_geneID from_plant to_geneID       to_plant source 
##    <chr>       <chr>      <chr>           <chr>    <chr>  
##  1 AT1G12040   ath        PruarM.1G000500 parm     FastOMA
##  2 AT1G62440   ath        PruarM.1G000500 parm     FastOMA
##  3 AT2G47410   ath        PruarM.8G368500 parm     FastOMA
##  4 AT4G02060   ath        PruarM.8G368700 parm     FastOMA
##  5 AT1G04400   ath        PruarM.1G051900 parm     MCScanX
##  6 AT1G04410   ath        PruarM.1G052500 parm     MCScanX
##  7 AT5G64410   ath        PruarM.8G146300 parm     MCScanX
##  8 AT5G64410   ath        PruarM.8G146200 parm     MCScanX
##  9 AT5G10270   ath        PruarM.1G279700 parm     OrthoDB
## 10 AT5G64960   ath        PruarM.1G279700 parm     OrthoDB
## 11 AT2G15790   ath        PruarM.8G195500 parm     OrthoDB
## 12 AT4G34660   ath        PruarM.8G163400 parm     OrthoDB
## 13 AT1G01010   ath        PruarM.2G368400 parm     RBH    
## 14 AT1G01030   ath        PruarM.5G193300 parm     RBH    
## 15 ATMG01360   ath        PruarM.4G189900 parm     RBH    
## 16 ATMG01360   ath        PruarM.4G190100 parm     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger (Mb)  max used   (Mb)
## Ncells  3114878 166.4    8762977  468  13692148  731.3
## Vcells 52632599 401.6  156357790 1193 195447236 1491.2

8.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22351
length(unique(dt$to_geneID))
## [1] 22551
table(dt$source)
## 
## FastOMA MCScanX OrthoDB     RBH 
##   43038   18615   52084   25259
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

8.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

8.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

8.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

8.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "from_count"      "to_count"       
##  [9] "count_evidence"  "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION"
## [13] "all_pathways"    "short_name"      "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE 
## 84042
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger (Mb)  max used   (Mb)
## Ncells  3298180 176.2    8762977  468  13692148  731.3
## Vcells 46553594 355.2  156357790 1193 195447236 1491.2

8.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 27414

8.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22351 
##      # parm unigue genes: 22551 
##      # ath highest connection degree:  267 
##      # parm highest connection degree:  113 
##      # genes in ath with degree > 30:  344 
##      # genes in parm with degree > 30:  392
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          40370          27414          10212           6046
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       20994      20423     8961           4950
##   2        7132       2400      896            600
##   3        6352       2247      227            306
##   4        5892       2344      128            190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
##  84042
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

8.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          17465           4442           1221           1153
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        1147        545      606            253
##   2        4818        534      342            438
##   3        5608       1019      145            272
##   4        5892       2344      128            190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 16773 
##      # parm unigue genes: 15078 
##      # ath highest connection degree:  29 
##      # parm highest connection degree:  19 
##      # genes in ath with degree > 30:  0 
##      # genes in parm with degree > 30:  0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##                filter_criteria Count
##                         <fctr> <int>
## 1:                     MCScanX 18615
## 2: OrthoDB_FastOMA_RBH_MapMan4  2608
## 3:     FastOMA_OrthoDB_MapMan4  1415
## 4:         OrthoDB_RBH_MapMan4   893
## 5:         FastOMA_RBH_MapMan4   750
## 6:                      reject 59761

8.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##     FastOMA_OrthoDB_MapMan4         FastOMA_RBH_MapMan4 
##                          91                          26 
##                     MCScanX OrthoDB_FastOMA_RBH_MapMan4 
##                         768                         115 
##         OrthoDB_RBH_MapMan4                      reject 
##                          53                        1822
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pcox'
  , plantNameOut = "pear"
  , plantDirOut = file.path('..', 'reports', group, "pear")
  , flag = 4
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000260bf88ac80>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-150] | |…… | 11% | |……. | 15% [unnamed-chunk-151] | |……… | 19% | |……….. | 22% [unnamed-chunk-152] | |…………. | 26% | |…………… | 30% [unnamed-chunk-153] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-154] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-155] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-156] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-157] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-158] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-159] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-160] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-161] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-162] | |…………………………………………..| 100%

cat(child_content)

9 Subsection: pcox

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

9.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  75969 33983 36090
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_plant to_geneID                     to_plant source 
##    <chr>       <chr>      <chr>                         <chr>    <chr>  
##  1 AT1G20520   ath        Pyrco.da.v2a1.augustus.000230 pcox     FastOMA
##  2 AT1G76210   ath        Pyrco.da.v2a1.augustus.000230 pcox     FastOMA
##  3 AT5G53090   ath        Pyrco.da.v2a1.snap.445350     pcox     FastOMA
##  4 AT5G53100   ath        Pyrco.da.v2a1.snap.445350     pcox     FastOMA
##  5 AT1G03900   ath        Pyrco.da.v2a1.chr10A.089110   pcox     MCScanX
##  6 AT1G03920   ath        Pyrco.da.v2a1.chr10A.089090   pcox     MCScanX
##  7 AT5G56870   ath        Pyrco.da.v2a1.chr9A.225970    pcox     MCScanX
##  8 AT5G57035   ath        Pyrco.da.v2a1.chr9A.225390    pcox     MCScanX
##  9 AT1G01030   ath        Pyrco.da.v2a1.chr14A.371380   pcox     RBH    
## 10 AT1G01030   ath        Pyrco.da.v2a1.chr1A.345960    pcox     RBH    
## 11 ATMG01250   ath        Pyrco.da.v2a1.chr5A.045340    pcox     RBH    
## 12 ATMG01250   ath        Pyrco.da.v2a1.snap.153710     pcox     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2738729 146.3    8762977 468.0  13692148  731.3
## Vcells 39962793 304.9  125086232 954.4 195447236 1491.2

9.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 21603
length(unique(dt$to_geneID))
## [1] 30838
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##   75969   33983   36090
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

9.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

9.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

9.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

9.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "all_pathways"   
## [13] "short_name"      "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE 
## 95885
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2935517 156.8    8762977 468.0  13692148  731.3
## Vcells 40502642 309.1  125086232 954.4 195447236 1491.2

9.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 16546

9.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 21603 
##      # pcox unigue genes: 30838 
##      # ath highest connection degree:  136 
##      # pcox highest connection degree:  116 
##      # genes in ath with degree > 30:  261 
##      # genes in pcox with degree > 30:  287
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          60208          16546          10595           8536
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       36081       8117     9410           7873
##   2       12902       4306      975            468
##   3       11225       4123      210            195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
##  95885
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

9.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          25400           8415           3269            994
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        3024       1653     2355            395
##   2       11151       2639      704            404
##   3       11225       4123      210            195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 17474 
##      # pcox unigue genes: 24456 
##      # ath highest connection degree:  23 
##      # pcox highest connection degree:  20 
##      # genes in ath with degree > 30:  0 
##      # genes in pcox with degree > 30:  0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##        filter_criteria Count
##                 <fctr> <int>
## 1:             MCScanX 33983
## 2: FastOMA_RBH_MapMan4  4095
## 3:              reject 57807

9.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
## FastOMA_RBH_MapMan4             MCScanX              reject 
##                 161                1459                1684
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pcer'
  , plantNameOut = "cherryplum"
  , plantDirOut = file.path('..', 'reports', group, "cherryplum")
  , flag = 4
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000260d6e47900>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-178] | |…… | 11% | |……. | 15% [unnamed-chunk-179] | |……… | 19% | |……….. | 22% [unnamed-chunk-180] | |…………. | 26% | |…………… | 30% [unnamed-chunk-181] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-182] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-183] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-184] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-185] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-186] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-187] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-188] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-189] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-190] | |…………………………………………..| 100%

cat(child_content)

10 Subsection: pcer

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

10.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  162100 63358 80487
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT2G32760   ath        Pcer_000001-RA pcer     FastOMA
##  2 AT1G07920   ath        Pcer_000002-RA pcer     FastOMA
##  3 AT1G16650   ath        Pcer_097557-RA pcer     FastOMA
##  4 AT1G53530   ath        Pcer_097558-RA pcer     FastOMA
##  5 AT1G04220   ath        Pcer_000137-RA pcer     MCScanX
##  6 AT1G04230   ath        Pcer_000150-RA pcer     MCScanX
##  7 ATMG00080   ath        Pcer_091446-RA pcer     MCScanX
##  8 ATMG00513   ath        Pcer_091469-RA pcer     MCScanX
##  9 AT1G01030   ath        Pcer_027461-RA pcer     RBH    
## 10 AT1G01030   ath        Pcer_038773-RA pcer     RBH    
## 11 ATMG01330   ath        Pcer_091451-RA pcer     RBH    
## 12 ATMG01360   ath        Pcer_096779-RA pcer     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2648330 141.5    8762977 468.0  13692148  731.3
## Vcells 38924177 297.0  125086232 954.4 195447236 1491.2

10.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22197
length(unique(dt$to_geneID))
## [1] 71437
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##  162100   63358   80487
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

10.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

10.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

10.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

10.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "all_pathways"   
## [13] "short_name"      "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
##  FALSE   TRUE 
## 201293      3
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## [1] "Pcer_097367-RB" "Pcer_097392-RB" "Pcer_097544-RB"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  3451314 184.4    8762977 468.0  13692148  731.3
## Vcells 56116124 428.2  125234665 955.5 195447236 1491.2

10.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 40662

10.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22197 
##      # pcer unigue genes: 71437 
##      # ath highest connection degree:  270 
##      # pcer highest connection degree:  113 
##      # genes in ath with degree > 30:  890 
##      # genes in pcer with degree > 30:  449
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##         126211          40662          19225          15198
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       75114      21956    16707          13585
##   2       29613      10371     1995           1240
##   3       21484       8335      523            373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
## 201296
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

10.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          54173          15829           3833           2176
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        6674       2600     2064            701
##   2       26015       4894     1246           1102
##   3       21484       8335      523            373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 17856 
##      # pcer unigue genes: 50879 
##      # ath highest connection degree:  64 
##      # pcer highest connection degree:  20 
##      # genes in ath with degree > 30:  30 
##      # genes in pcer with degree > 30:  0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##        filter_criteria  Count
##                 <fctr>  <int>
## 1:             MCScanX  63358
## 2: FastOMA_RBH_MapMan4  12653
## 3:              reject 125285

10.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
## FastOMA_RBH_MapMan4             MCScanX              reject 
##                 547                2541                4379
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'psib'
  , plantNameOut = "siberianapricot"
  , plantDirOut = file.path('..', 'reports', group, "siberianapricot")
  , flag = 3
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000026078b404d8>

child_content <- knitr::knit_child("09_translate-child_strict.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child_strict.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-206] | |…… | 11% | |……. | 15% [unnamed-chunk-207] | |……… | 19% | |……….. | 22% [unnamed-chunk-208] | |…………. | 26% | |…………… | 30% [unnamed-chunk-209] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-210] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-211] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-212] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-213] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-214] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-215] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-216] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-217] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-218] | |…………………………………………..| 100%

cat(child_content)

11 Subsection: psib

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

11.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  40732 16398 20889 25288
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 16 × 5
##    from_geneID from_plant to_geneID         to_plant source 
##    <chr>       <chr>      <chr>             <chr>    <chr>  
##  1 AT1G12040   ath        PaF106G0100000005 psib     FastOMA
##  2 AT1G62440   ath        PaF106G0100000005 psib     FastOMA
##  3 AT3G07140   ath        PaF106G0800032954 psib     FastOMA
##  4 AT3G07140   ath        PaF106G0800032956 psib     FastOMA
##  5 AT1G04230   ath        PaF106G0100000358 psib     MCScanX
##  6 AT1G04240   ath        PaF106G0100000361 psib     MCScanX
##  7 ATCG00340   ath        PaF106G0700028636 psib     MCScanX
##  8 ATCG00350   ath        PaF106G0700028635 psib     MCScanX
##  9 AT2G43200   ath        PaF106G0500020125 psib     OrthoDB
## 10 AT1G52770   ath        PaF106G0300013722 psib     OrthoDB
## 11 AT3G56950   ath        PaF106G0700028984 psib     OrthoDB
## 12 AT5G37530   ath        PaF106G0300012640 psib     OrthoDB
## 13 AT1G01030   ath        PaF106G0500020091 psib     RBH    
## 14 AT1G01040   ath        PaF106G0200009357 psib     RBH    
## 15 ATMG01190   ath        PaF106G0600023358 psib     RBH    
## 16 ATMG01250   ath        PaF106G0700028671 psib     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  2825261 150.9    7010383 374.4  13692148  731.3
## Vcells 48771093 372.1  100187732 764.4 195447236 1491.2

11.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22040
length(unique(dt$to_geneID))
## [1] 20342
table(dt$source)
## 
## FastOMA MCScanX OrthoDB     RBH 
##   40732   16398   20889   25288
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

11.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

11.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

11.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-24-strict.xlsx'), 
                     asTable = TRUE) # change name

11.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "from_count"      "to_count"       
##  [9] "count_evidence"  "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION"
## [13] "all_pathways"    "short_name"      "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE 
## 58143
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  3065509 163.8    7010383 374.4  13692148  731.3
## Vcells 40880778 311.9  100187732 764.4 195447236 1491.2

11.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 12813

11.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22040 
##      # psib unigue genes: 20342 
##      # ath highest connection degree:  58 
##      # psib highest connection degree:  115 
##      # genes in ath with degree > 30:  135 
##      # genes in psib with degree > 30:  106
dt = as.data.table(df)
dt[, filter_criteria := "reject"]


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          35534          12813           6004           3792
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       18977       6640     5025           3323
##   2        6361       2039      641            312
##   3        6065       2253      249             97
##   4        4131       1881       89             60
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}


dt[, filter_criteria := "reject"]
covered_genes = character(nrow(dt))  # reset if needed
print(table(dt$filter_criteria))
## 
## reject 
##  58143
priority_methods =  setdiff(methods, special_methods)
dt[, use := TRUE]
method = NULL
for (method in priority_methods) {
  # rows indices satisfying all conditions; . causes issue!
  eligible_rows <- which(
    dt$filter_criteria == "reject" &
    dt[[method]] == TRUE &
    dt$to_geneID %nin% covered_genes &
    dt$use == TRUE
  )
  if(length(eligible_rows) > 0) {
    # Update FILTER
    dt[eligible_rows, filter_criteria := method]
    # Update covered_genes
    covered_genes = unique(c(covered_genes, dt[eligible_rows, to_geneID]))
    # Update use
    dt[to_geneID %in% covered_genes, use := FALSE]

    
  } else {
    cat("No eligible rows found for:", method, "\n")
  }
}


# uncovered genes
eligible_rows = which(
  dt$filter_criteria == "reject" &
  dt$use == TRUE &
  grepl("match based on", dt$MapMan4_Match, ignore.case = TRUE) # MapMan match
)
if (length(eligible_rows) > 0) {
  sub_dt = dt[eligible_rows] # a temporary snapshot used for fast vectorized computation
  # how many special methods are TRUE per row
  method_matrix = sapply(special_methods, function(m) sub_dt[[m]])
  count_covered = rowSums(method_matrix, na.rm = TRUE)
  new_criteria = rep("reject", length(eligible_rows))
  # For rows with 2 or 3 methods
  for (j in seq_along(eligible_rows)) {
    if (count_covered[j] == 3) {
      new_criteria[j] = "OrthoDB_FastOMA_RBH_MapMan4"
    } else if (count_covered[j] == 2) {
      covered_by = special_methods[method_matrix[j, ]]
      new_criteria[j] = paste0(paste(sort(covered_by), collapse = "_"), "_MapMan4")
    }
  }
  # update
  dt[eligible_rows, filter_criteria := new_criteria]
  dt[eligible_rows, use := FALSE]
} else {
  print("Nothing here!")
}


# table(dt$filter_criteria)
# table(dt[dt$use, MapMan4_Match])
dt[, use := NULL]



df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

11.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          15378           4403           1448            545
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        1154        859      878            174
##   2        4582        627      312            219
##   3        5511       1036      169             92
##   4        4131       1881       89             60
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, 
                                # levels = c("MCScanX", "compara", "PLAZA",
                                #                     "OrthoDB_FastOMA_RBH",
                                #                     "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                #                     "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                levels = c("MCScanX", "compara", "PLAZA",
                                           "OrthoDB_FastOMA_RBH_MapMan4",
                                           "FastOMA_OrthoDB_MapMan4", "OrthoDB_RBH_MapMan4", "FastOMA_RBH_MapMan4"
                                           ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3_2025-09-24-strict.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-24-strict.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 15980 
##      # psib unigue genes: 14649 
##      # ath highest connection degree:  22 
##      # psib highest connection degree:  15 
##      # genes in ath with degree > 30:  0 
##      # genes in psib with degree > 30:  0
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  # "OrthoDB_FastOMA_RBH",
  # "OrthoDB_RBH",
  # "FastOMA_OrthoDB",
  # "FastOMA_RBH",
  # "OrthoDB_MapMan4",
  # "RBH_MapMan4",
  # "FastOMA_MapMan4
  "OrthoDB_FastOMA_RBH_MapMan4",
  "FastOMA_OrthoDB_MapMan4", 
  "OrthoDB_RBH_MapMan4", 
  "FastOMA_RBH_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##                filter_criteria Count
##                         <fctr> <int>
## 1:                     MCScanX 16398
## 2: OrthoDB_FastOMA_RBH_MapMan4  2598
## 3:     FastOMA_OrthoDB_MapMan4   731
## 4:         OrthoDB_RBH_MapMan4   595
## 5:         FastOMA_RBH_MapMan4  1452
## 6:                      reject 36369

11.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##     FastOMA_OrthoDB_MapMan4         FastOMA_RBH_MapMan4 
##                          37                          64 
##                     MCScanX OrthoDB_FastOMA_RBH_MapMan4 
##                         674                          93 
##         OrthoDB_RBH_MapMan4                      reject 
##                          33                        1259
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-24-strict.xlsx'), 
                     asTable = TRUE)

12 SessionInfo

sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8 
## [2] LC_CTYPE=English_United Kingdom.utf8   
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.utf8    
## 
## time zone: Europe/Ljubljana
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_3.5.2      knitr_1.50         data.table_1.17.0 
## [5] magrittr_2.0.3    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     crayon_1.5.3       dplyr_1.1.4       
##  [5] compiler_4.4.1     zip_2.3.2          Rcpp_1.0.14        tidyselect_1.2.1  
##  [9] stringr_1.5.1      dichromat_2.0-0.1  jquerylib_0.1.4    textshaping_1.0.1 
## [13] systemfonts_1.2.3  scales_1.4.0       yaml_2.3.10        fastmap_1.2.0     
## [17] R6_2.6.1           labeling_0.4.3     generics_0.1.4     patchwork_1.3.0   
## [21] igraph_2.1.4       openxlsx_4.2.8     tibble_3.2.1       bslib_0.9.0       
## [25] pillar_1.10.2      RColorBrewer_1.1-3 rlang_1.1.5        utf8_1.2.5        
## [29] cachem_1.1.0       stringi_1.8.7      xfun_0.52          sass_0.4.10       
## [33] cli_3.6.3          withr_3.0.2        digest_0.6.37      grid_4.4.1        
## [37] rstudioapi_0.17.1  lifecycle_1.0.4    vctrs_0.6.5        evaluate_1.0.3    
## [41] glue_1.8.0         farver_2.1.2       ragg_1.4.0         colorspace_2.1-1  
## [45] rmarkdown_2.29     tools_4.4.1        pkgconfig_2.0.3    htmltools_0.5.8.1